1 package org.apache.lucene.search.spell;
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20 import java.util.ArrayList;
21 import java.util.List;
22 import java.util.regex.Pattern;
23
24 import junit.framework.Assert;
25
26 import org.apache.lucene.analysis.Analyzer;
27 import org.apache.lucene.analysis.MockAnalyzer;
28 import org.apache.lucene.analysis.MockTokenizer;
29 import org.apache.lucene.document.Document;
30 import org.apache.lucene.document.Field;
31 import org.apache.lucene.index.DirectoryReader;
32 import org.apache.lucene.index.IndexReader;
33 import org.apache.lucene.index.RandomIndexWriter;
34 import org.apache.lucene.index.Term;
35 import org.apache.lucene.search.spell.WordBreakSpellChecker.BreakSuggestionSortMethod;
36 import org.apache.lucene.store.Directory;
37 import org.apache.lucene.util.English;
38 import org.apache.lucene.util.IOUtils;
39 import org.apache.lucene.util.LuceneTestCase;
40 import org.apache.lucene.util.TestUtil;
41
42 public class TestWordBreakSpellChecker extends LuceneTestCase {
43 private Directory dir;
44 private Analyzer analyzer;
45
46 @Override
47 public void setUp() throws Exception {
48 super.setUp();
49 dir = newDirectory();
50 analyzer = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, true);
51 RandomIndexWriter writer = new RandomIndexWriter(random(), dir, analyzer);
52
53 for (int i = 900; i < 1112; i++) {
54 Document doc = new Document();
55 String num = English.intToEnglish(i).replaceAll("[-]", " ").replaceAll("[,]", "");
56 doc.add(newTextField("numbers", num, Field.Store.NO));
57 writer.addDocument(doc);
58 }
59
60 {
61 Document doc = new Document();
62 doc.add(newTextField("numbers", "thou hast sand betwixt thy toes", Field.Store.NO));
63 writer.addDocument(doc);
64 }
65 {
66 Document doc = new Document();
67 doc.add(newTextField("numbers", "hundredeight eightyeight yeight", Field.Store.NO));
68 writer.addDocument(doc);
69 }
70 {
71 Document doc = new Document();
72 doc.add(newTextField("numbers", "tres y cinco", Field.Store.NO));
73 writer.addDocument(doc);
74 }
75
76 writer.commit();
77 writer.close();
78 }
79
80 @Override
81 public void tearDown() throws Exception {
82 IOUtils.close(dir, analyzer);
83 super.tearDown();
84 }
85
86 public void testCombiningWords() throws Exception {
87 IndexReader ir = DirectoryReader.open(dir);
88 WordBreakSpellChecker wbsp = new WordBreakSpellChecker();
89
90 {
91 Term[] terms = {
92 new Term("numbers", "one"),
93 new Term("numbers", "hun"),
94 new Term("numbers", "dred"),
95 new Term("numbers", "eight"),
96 new Term("numbers", "y"),
97 new Term("numbers", "eight"),
98 };
99 wbsp.setMaxChanges(3);
100 wbsp.setMaxCombineWordLength(20);
101 wbsp.setMinSuggestionFrequency(1);
102 CombineSuggestion[] cs = wbsp.suggestWordCombinations(terms, 10, ir, SuggestMode.SUGGEST_ALWAYS);
103 Assert.assertTrue(cs.length==5);
104
105 Assert.assertTrue(cs[0].originalTermIndexes.length==2);
106 Assert.assertTrue(cs[0].originalTermIndexes[0]==1);
107 Assert.assertTrue(cs[0].originalTermIndexes[1]==2);
108 Assert.assertTrue(cs[0].suggestion.string.equals("hundred"));
109 Assert.assertTrue(cs[0].suggestion.score==1);
110
111 Assert.assertTrue(cs[1].originalTermIndexes.length==2);
112 Assert.assertTrue(cs[1].originalTermIndexes[0]==3);
113 Assert.assertTrue(cs[1].originalTermIndexes[1]==4);
114 Assert.assertTrue(cs[1].suggestion.string.equals("eighty"));
115 Assert.assertTrue(cs[1].suggestion.score==1);
116
117 Assert.assertTrue(cs[2].originalTermIndexes.length==2);
118 Assert.assertTrue(cs[2].originalTermIndexes[0]==4);
119 Assert.assertTrue(cs[2].originalTermIndexes[1]==5);
120 Assert.assertTrue(cs[2].suggestion.string.equals("yeight"));
121 Assert.assertTrue(cs[2].suggestion.score==1);
122
123 for(int i=3 ; i<5 ; i++) {
124 Assert.assertTrue(cs[i].originalTermIndexes.length==3);
125 Assert.assertTrue(cs[i].suggestion.score==2);
126 Assert.assertTrue(
127 (cs[i].originalTermIndexes[0]==1 &&
128 cs[i].originalTermIndexes[1]==2 &&
129 cs[i].originalTermIndexes[2]==3 &&
130 cs[i].suggestion.string.equals("hundredeight")) ||
131 (cs[i].originalTermIndexes[0]==3 &&
132 cs[i].originalTermIndexes[1]==4 &&
133 cs[i].originalTermIndexes[2]==5 &&
134 cs[i].suggestion.string.equals("eightyeight"))
135 );
136 }
137
138 cs = wbsp.suggestWordCombinations(terms, 5, ir, SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX);
139 Assert.assertTrue(cs.length==2);
140 Assert.assertTrue(cs[0].originalTermIndexes.length==2);
141 Assert.assertTrue(cs[0].suggestion.score==1);
142 Assert.assertTrue(cs[0].originalTermIndexes[0]==1);
143 Assert.assertTrue(cs[0].originalTermIndexes[1]==2);
144 Assert.assertTrue(cs[0].suggestion.string.equals("hundred"));
145 Assert.assertTrue(cs[0].suggestion.score==1);
146
147 Assert.assertTrue(cs[1].originalTermIndexes.length==3);
148 Assert.assertTrue(cs[1].suggestion.score==2);
149 Assert.assertTrue(cs[1].originalTermIndexes[0] == 1);
150 Assert.assertTrue(cs[1].originalTermIndexes[1] == 2);
151 Assert.assertTrue(cs[1].originalTermIndexes[2] == 3);
152 Assert.assertTrue(cs[1].suggestion.string.equals("hundredeight"));
153 }
154 ir.close();
155 }
156
157 public void testBreakingWords() throws Exception {
158 IndexReader ir = DirectoryReader.open(dir);
159 WordBreakSpellChecker wbsp = new WordBreakSpellChecker();
160
161 {
162 Term term = new Term("numbers", "ninetynine");
163 wbsp.setMaxChanges(1);
164 wbsp.setMinBreakWordLength(1);
165 wbsp.setMinSuggestionFrequency(1);
166 SuggestWord[][] sw = wbsp.suggestWordBreaks(term, 5, ir, SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX, BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY);
167 Assert.assertTrue(sw.length==1);
168 Assert.assertTrue(sw[0].length==2);
169 Assert.assertTrue(sw[0][0].string.equals("ninety"));
170 Assert.assertTrue(sw[0][1].string.equals("nine"));
171 Assert.assertTrue(sw[0][0].score == 1);
172 Assert.assertTrue(sw[0][1].score == 1);
173 }
174 {
175 Term term = new Term("numbers", "onethousand");
176 wbsp.setMaxChanges(1);
177 wbsp.setMinBreakWordLength(1);
178 wbsp.setMinSuggestionFrequency(1);
179 SuggestWord[][] sw = wbsp.suggestWordBreaks(term, 2, ir, SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX, BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY);
180 Assert.assertTrue(sw.length==1);
181 Assert.assertTrue(sw[0].length==2);
182 Assert.assertTrue(sw[0][0].string.equals("one"));
183 Assert.assertTrue(sw[0][1].string.equals("thousand"));
184 Assert.assertTrue(sw[0][0].score == 1);
185 Assert.assertTrue(sw[0][1].score == 1);
186
187 wbsp.setMaxChanges(2);
188 wbsp.setMinSuggestionFrequency(1);
189 sw = wbsp.suggestWordBreaks(term, 1, ir, SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX, BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY);
190 Assert.assertTrue(sw.length==1);
191 Assert.assertTrue(sw[0].length==2);
192
193 wbsp.setMaxChanges(2);
194 wbsp.setMinSuggestionFrequency(2);
195 sw = wbsp.suggestWordBreaks(term, 2, ir, SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX, BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY);
196 Assert.assertTrue(sw.length==1);
197 Assert.assertTrue(sw[0].length==2);
198
199 wbsp.setMaxChanges(2);
200 wbsp.setMinSuggestionFrequency(1);
201 sw = wbsp.suggestWordBreaks(term, 2, ir, SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX, BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY);
202 Assert.assertTrue(sw.length==2);
203 Assert.assertTrue(sw[0].length==2);
204 Assert.assertTrue(sw[0][0].string.equals("one"));
205 Assert.assertTrue(sw[0][1].string.equals("thousand"));
206 Assert.assertTrue(sw[0][0].score == 1);
207 Assert.assertTrue(sw[0][1].score == 1);
208 Assert.assertTrue(sw[0][1].freq>1);
209 Assert.assertTrue(sw[0][0].freq>sw[0][1].freq);
210 Assert.assertTrue(sw[1].length==3);
211 Assert.assertTrue(sw[1][0].string.equals("one"));
212 Assert.assertTrue(sw[1][1].string.equals("thou"));
213 Assert.assertTrue(sw[1][2].string.equals("sand"));
214 Assert.assertTrue(sw[1][0].score == 2);
215 Assert.assertTrue(sw[1][1].score == 2);
216 Assert.assertTrue(sw[1][2].score == 2);
217 Assert.assertTrue(sw[1][0].freq>1);
218 Assert.assertTrue(sw[1][1].freq==1);
219 Assert.assertTrue(sw[1][2].freq==1);
220 }
221 {
222 Term term = new Term("numbers", "onethousandonehundredeleven");
223 wbsp.setMaxChanges(3);
224 wbsp.setMinBreakWordLength(1);
225 wbsp.setMinSuggestionFrequency(1);
226 SuggestWord[][] sw = wbsp.suggestWordBreaks(term, 5, ir, SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX, BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY);
227 Assert.assertTrue(sw.length==0);
228
229 wbsp.setMaxChanges(4);
230 sw = wbsp.suggestWordBreaks(term, 5, ir, SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX, BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY);
231 Assert.assertTrue(sw.length==1);
232 Assert.assertTrue(sw[0].length==5);
233
234 wbsp.setMaxChanges(5);
235 sw = wbsp.suggestWordBreaks(term, 5, ir, SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX, BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY);
236 Assert.assertTrue(sw.length==2);
237 Assert.assertTrue(sw[0].length==5);
238 Assert.assertTrue(sw[0][1].string.equals("thousand"));
239 Assert.assertTrue(sw[1].length==6);
240 Assert.assertTrue(sw[1][1].string.equals("thou"));
241 Assert.assertTrue(sw[1][2].string.equals("sand"));
242 }
243 {
244
245 Term term = new Term("numbers", "\uD864\uDC79");
246 wbsp.setMaxChanges(1);
247 wbsp.setMinBreakWordLength(1);
248 wbsp.setMinSuggestionFrequency(1);
249 SuggestWord[][] sw = wbsp.suggestWordBreaks(term, 5, ir, SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX, BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY);
250 Assert.assertTrue(sw.length==0);
251 }
252
253 ir.close();
254 }
255
256 public void testRandom() throws Exception {
257 int numDocs = TestUtil.nextInt(random(), (10 * RANDOM_MULTIPLIER),
258 (100 * RANDOM_MULTIPLIER));
259 IndexReader ir = null;
260
261 Directory dir = newDirectory();
262 Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false);
263 RandomIndexWriter writer = new RandomIndexWriter(random(), dir, analyzer);
264 int maxLength = TestUtil.nextInt(random(), 5, 50);
265 List<String> originals = new ArrayList<>(numDocs);
266 List<String[]> breaks = new ArrayList<>(numDocs);
267 for (int i = 0; i < numDocs; i++) {
268 String orig = "";
269 if (random().nextBoolean()) {
270 while (!goodTestString(orig)) {
271 orig = TestUtil.randomSimpleString(random(), maxLength);
272 }
273 } else {
274 while (!goodTestString(orig)) {
275 orig = TestUtil.randomUnicodeString(random(), maxLength);
276 }
277 }
278 originals.add(orig);
279 int totalLength = orig.codePointCount(0, orig.length());
280 int breakAt = orig.offsetByCodePoints(0,
281 TestUtil.nextInt(random(), 1, totalLength - 1));
282 String[] broken = new String[2];
283 broken[0] = orig.substring(0, breakAt);
284 broken[1] = orig.substring(breakAt);
285 breaks.add(broken);
286 Document doc = new Document();
287 doc.add(newTextField("random_break", broken[0] + " " + broken[1],
288 Field.Store.NO));
289 doc.add(newTextField("random_combine", orig, Field.Store.NO));
290 writer.addDocument(doc);
291 }
292 writer.commit();
293 writer.close();
294
295 ir = DirectoryReader.open(dir);
296 WordBreakSpellChecker wbsp = new WordBreakSpellChecker();
297 wbsp.setMaxChanges(1);
298 wbsp.setMinBreakWordLength(1);
299 wbsp.setMinSuggestionFrequency(1);
300 wbsp.setMaxCombineWordLength(maxLength);
301 for (int i = 0; i < originals.size(); i++) {
302 String orig = originals.get(i);
303 String left = breaks.get(i)[0];
304 String right = breaks.get(i)[1];
305 {
306 Term term = new Term("random_break", orig);
307
308 SuggestWord[][] sw = wbsp.suggestWordBreaks(term, originals.size(),
309 ir, SuggestMode.SUGGEST_ALWAYS,
310 BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY);
311 boolean failed = true;
312 for (SuggestWord[] sw1 : sw) {
313 Assert.assertTrue(sw1.length == 2);
314 if (sw1[0].string.equals(left) && sw1[1].string.equals(right)) {
315 failed = false;
316 }
317 }
318 Assert.assertFalse("Failed getting break suggestions\n >Original: "
319 + orig + "\n >Left: " + left + "\n >Right: " + right, failed);
320 }
321 {
322 Term[] terms = {new Term("random_combine", left),
323 new Term("random_combine", right)};
324 CombineSuggestion[] cs = wbsp.suggestWordCombinations(terms,
325 originals.size(), ir, SuggestMode.SUGGEST_ALWAYS);
326 boolean failed = true;
327 for (CombineSuggestion cs1 : cs) {
328 Assert.assertTrue(cs1.originalTermIndexes.length == 2);
329 if (cs1.suggestion.string.equals(left + right)) {
330 failed = false;
331 }
332 }
333 Assert.assertFalse("Failed getting combine suggestions\n >Original: "
334 + orig + "\n >Left: " + left + "\n >Right: " + right, failed);
335 }
336 }
337 IOUtils.close(ir, dir, analyzer);
338 }
339
340 private static final Pattern mockTokenizerWhitespacePattern = Pattern
341 .compile("[ \\t\\r\\n]");
342
343 private boolean goodTestString(String s) {
344 if (s.codePointCount(0, s.length()) < 2
345 || mockTokenizerWhitespacePattern.matcher(s).find()) {
346 return false;
347 }
348 return true;
349 }
350 }